

library(haven)
library(stargazer)

###
### IRT estimator
###
source("../IRT_mixing/mixture_irt.R")
#source("mixture_irt.R")

#load(file="all_results.RData") #reslist
#load(file="bigsurveys_recoded_plus_estimates.RData") #data
load(file="all_results4.RData") #reslist
data <- read.dta(file="bigsurveys_recoded4.dta")

policy_ind <- 3:318
years <- unique(data$source)

###
###  standardize estimated parameters
###

for(i in 1:length(reslist)){
    year <- years[i]
    print(years[i])
    
    sdx <- sd(reslist[[i]]$irt$x)
    meanx <- mean(reslist[[i]]$irt$x)
    reslist[[i]]$irt$a <- reslist[[i]]$irt$a +
           reslist[[i]]$irt$b*meanx
    reslist[[i]]$irt$b <- reslist[[i]]$irt$b*sdx
reslist[[i]]$irt$x <- (reslist[[i]]$irt$x-meanx)/sdx
}

###
### Evaluate dispersion in the discrimination parameters
###
for(i in 1:length(reslist)){
    year <- years[i]
    print(years[i])

    mat <- apply(data[data$source==year,policy_ind],2,as.numeric)
    exclude <- ! colnames(mat) %in% c("cces2012_immigration5","cces2012_immigration6")
    notna <- which(! is.na(colMeans(mat,na.rm=T)) & exclude)
    print(paste("items:",length(notna)))   
    mat <- mat[, notna]
    
    print(paste("max discrimination:",round(max(abs(reslist[[i]]$irt$b)),3)))
    ind <- which(abs(reslist[[i]]$irt$b) > 1/2*max(abs(reslist[[i]]$irt$b)))
    print(paste("items with > half discrimination of max:",length(ind)))
    print(colnames(mat)[ind])
    ind <- which(abs(reslist[[i]]$irt$b) > 1/3*max(abs(reslist[[i]]$irt$b)))
    print(paste("items with > 1/3 discrimination of max:",length(ind)))
    print(colnames(mat)[ind])
    print("--------------")
}

###
### What are the "real" number of items on each survey?
###

# looking at how many questions people actually answered
Ns <- c()
for(i in years){
    print(i)
    tmp <- data[data$source==i,policy_ind]
    notnas <- apply(tmp,1,function(x){length(which(! is.na(x)))})
    print(summary(notnas))
    Ns <- c(Ns,median(notnas,na.rm=T))
}
library(xtable)
xtable(cbind(years,Ns))

# in CCES 2014, how many people answered more than the median?
tmp <- data[data$source=="CCES_2014",policy_ind]
notnas <- apply(tmp,1,function(x){length(which(! is.na(x)))})
table(notnas > 32)
# it looks like most people were asked 32 questions and answered them

# now look at this by question to identify these 32 questions
notnas <- apply(tmp,2,function(x){length(which(! is.na(x)))})
summary(notnas)
table(notnas > 50000)
cces_2014_ind <- which(notnas > 50000)
cces_2014_names <- colnames(tmp)[cces_2014_ind]

mat <- apply(data[data$source=="CCES_2014",policy_ind],2,as.numeric)
exclude <- ! colnames(mat) %in% c("cces2012_immigration5","cces2012_immigration6")
notna <- which(! is.na(colMeans(mat,na.rm=T)) & exclude)
mat <- mat[, notna]
table(colnames(mat) %in% cces_2014_names)

mat <- mat[,which(colnames(mat) %in% cces_2014_names)]

#item_index <- colnames(mat) %in% cces_2014_names

system.time(res0 <- em_mix_irt(mat, iter=30))

# standardize estimates
sdx <- sd(res0$irt$x)
meanx <- mean(res0$irt$x)
res0$irt$a <- res0$irt$a + res0$irt$b*meanx
res0$irt$b <- res0$irt$b*sdx
res0$irt$x <- (res0$irt$x-meanx)/sdx

betas <- res0$irt$b
alphas <- res0$irt$a
xs <- res0$irt$x

n <- length(xs)
ntrials <- 3
Ms <- 10:(length(betas)-1)
trials <- rep(1:ntrials,length(Ms))
Ms <- rep(Ms,each=ntrials)
prop0 <- colMeans(res0$w)
#prop0 <- c(0.73805987,0.18086725,0.08107289)
est_props <- matrix(NA,3,length(Ms))
cor_w1 <- rep(NA,length(Ms))
cor_x <- rep(NA,length(Ms))
ind_list <- list()
res_list <- list()

for(i in 1:length(Ms)){
   ptm <- proc.time()
   m <- Ms[i]
   trial <- trials[i]
   print(paste(m,trial,sep=","))
   ind <- sample(1:length(betas),m)

   res <- em_mix_irt(mat[,ind], iter=30)

   # initial estimated proportions in each group
   print(prop0)
   est_props[,i] <- colMeans(res$w)
   cor_w1[i] <- cor(res$w[,1],res0$w[,1])
   print(est_props[,i])
   print(cor_w1[i])

   # spatial voter correlation with the initial estimates
   xcor <- cor(res$irt$x,xs)
   res$irt$x <- sign(xcor)*res$irt$x
   cor_x[i] <- cor(res$irt$x, xs)
   print(cor_x[i])
   print(proc.time() - ptm)

   res_list[[i]] <- res
   ind_list[[i]] <- ind
     
   save("est_props","cor_w1","cor_x","Ms","prop0","res_list","ind_list",
        file="simulation_output_alt2.RData")
}


pdf("simulation_cces_alt2.pdf")
ggplot(data=data.frame(cor_x=cor_x,M=Ms),
           aes(y=cor_x,x=M)) +
    geom_point() + geom_smooth()

ggplot(data=data.frame(cor_w1=cor_w1,M=Ms),
           aes(y=cor_w1,x=M)) +
    geom_point() + geom_smooth()

ggplot(data=data.frame(M=Ms,prop_spatial=est_props[1,],
                       prop_weighted=est_props[2,],
                       prop_random=est_props[3,])) +
    geom_point(aes(y=prop_spatial,x=M),color="red") +
    geom_point(aes(y=prop_weighted,x=M),color="blue") +
    geom_point(aes(y=prop_random,x=M),color="green") +
    geom_smooth(aes(y=prop_spatial,x=M),color="red") +
    geom_smooth(aes(y=prop_weighted,x=M),color="blue") +
    geom_smooth(aes(y=prop_random,x=M),color="green") +
    geom_hline(yintercept=prop0[1], linetype="dashed", color = "red") +
    geom_hline(yintercept=prop0[2], linetype="dashed", color = "blue") +
    geom_hline(yintercept=prop0[3], linetype="dashed", color = "green") 
dev.off()


margins <- colMeans(mat,na.rm=T)
sd_margins <- c()
for(i in 1:length(ind_list)){
    sd_margins <- c(sd_margins,
                    sd(margins[ind_list[[i]]]))
}

summary(lm(cor_w1~Ms))
summary(lm(cor_w1~sd_margins))
summary(lm(cor_w1~sd_margins*Ms))

mod1 <- lm(cor_w1~Ms)
mod2 <- lm(cor_w1~sd_margins)
mod3 <- lm(cor_w1~sd_margins*Ms)
stargazer(mod1,mod2,mod3)

summary(lm(cor_x~Ms))
summary(lm(cor_x~sd_margins*Ms))

mod1 <- lm(cor_x~Ms)
mod2 <- lm(cor_x~sd_margins)
mod3 <- lm(cor_x~sd_margins*Ms)
stargazer(mod1,mod2,mod3)

df <- data.frame(n_items=Ms,sd_margins=sd_margins,cor_x=cor_x,cor_w1=cor_w1)

write_dta(df,path="simulation_results_alt.dta")

